library(tidyverse)
library(janitor)
library(plotly)
library(forecast)
library(prophet)
# Creating a vector of file pathes
file_pathes <- list.files(path = "G:\\My Drive\\Projects\\smokingandcaner\\datasets\\iarc", pattern = "*.csv", full.names = TRUE)
# Looping and importeing the datasets
for (i in 1:length(file_pathes)){
assign(paste0("dataset", i), read_csv(file_pathes[i]))
}
# List all the datasets in the current working directory with a similar prefix
datasets <- ls(pattern="^dataset")
# Use lapply and rbind to combine the datasets into a single dataset
combined_data <- do.call(rbind, lapply(datasets, get))
combined_data <- combined_data %>% clean_names()
##
## Australia France Japan New Zealand
## 138 134 138 134
## Republic of Korea Singapore United Kingdom USA
## 68 110 134 136
male_data <- combined_data %>% filter(sex == 1)
male_plot <- ggplot(male_data) + geom_smooth(aes(year, asr_world, color=country_label))
ggplotly(male_plot)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
female_data <- combined_data %>% filter(sex == 2)
female_plot <- ggplot(female_data) + geom_smooth(aes(year, asr_world, color=country_label))
ggplotly(female_plot)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
The methodology I used for forecasting involved several steps. Initially, I decided to use the Prophet package instead of ARIMA since the data was non-stationary, and Prophet has the capability to handle such data.
Next, I focused on forecasting the smoking prevalence for males in Japan using the Prophet package. I started by selecting the relevant data columns and renaming them to match the required input format for the package. Then, I converted the data to the time series format using the as.Date function. I used the resulting time series as the input to the Prophet model, and used the make_future_dataframe function to generate the future data points.
For the second step, I utilized the forecasted smoking prevalence data and the historical data of age-standardized mortality to forecast the age-standardized mortality in Japan. I incorporated the forecasted smoking prevalence data as an additional regressor in the Prophet model, as it was expected to have a significant impact on the mortality rate.
Overall, the methodology I followed involved using the Prophet package to model the non-stationary time series data, and incorporating additional regressors to improve the forecasting accuracy. ### creating the time-series for japan
mortality <- read.csv("datasets/iarc/japan/mort.csv")
mortality <- mortality %>%
clean_names() %>%
select(year, asr_world, sex) %>%
mutate(sex = factor(sex, labels = c("male", "female")))
# plotting the asr_world per year for each sex in japan
ggplot(mortality) +
geom_line(aes(y = asr_world, x = year, colour = sex)) +
labs(title = "Japan Mortality Rate", x = "Year", y = "ASR Mortality")
smoking <- read.csv("datasets/iarc/japan/smokingpre_updated.csv") %>%
rename(year = Year) %>%
select(year, sex, smoker) %>%
mutate(sex = factor(sex, labels = c("male", "female")))
# plotting the number of smoker per sex for each year
ggplot(smoking) +
geom_line(aes(y = smoker, x = year, colour = sex)) +
labs(title = "Smoking in Japan", x = "Year", y = "Prevelance of Smoking")
# merging the two datasets
total_epi <- mortality %>%
full_join(smoking, by=c("year", "sex"))
#
ggplot(total_epi) +
geom_line(aes(y = asr_world, x = year, colour = sex)) +
geom_line(aes(y = smoker, x = year, colour = sex)) +
labs(title = "Japan Mortality Rate and Smoking", x = "Year")
## Warning: Removed 30 rows containing missing values (`geom_line()`).
total_epi <- total_epi %>% filter(sex == "male")
total_epi <- na.omit(total_epi)
df <- total_epi %>%
select(year, smoker) %>%
rename(ds = year, y = smoker) %>%
mutate(ds = as.Date(paste0(ds, "-01-01"), format = "%Y-%m-%d"))
m <- prophet(df)
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
future <- make_future_dataframe(m, periods = 20, freq = "year")
forecast <- predict(m, future)
p <- plot(m, forecast)
p <- p + geom_vline(xintercept = forecast$ds[forecast$yhat < 0][1], linetype = "dashed", color = "red")
p <- p + labs(title = "Forecasting Smoking Prevelance in Japan", x = "Year", y = "Smoking Prevelance")
p <- p + annotate(geom = "text", x = forecast$ds[forecast$yhat < 0][1], y = 0, label = as.character(year(forecast$ds[forecast$yhat < 0][1])))
ggplotly(p)
## Warning: Aspect ratios aren't yet implemented, but you can manually set a
## suitable height/width
## Warning: Aspect ratios aren't yet implemented, but you can manually set a
## suitable height/width
total_epi_prophet <- total_epi %>%
select(year, asr_world, smoker) %>%
rename(ds = year, y = asr_world, regressor = smoker)
total_epi_prophet$ds <- as.Date(paste0(total_epi_prophet$ds, "-01-01"), format = "%Y-%m-%d")
total_epi_prophet <- na.omit(total_epi_prophet)
m <- prophet()
m <- add_regressor(m, 'regressor')
m <- fit.prophet(m, total_epi_prophet)
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
future <- make_future_dataframe(m, periods = 20, freq = "year")
total_epi <- na.omit(total_epi)
future$regressor <- forecast$yhat
forecast_mort <- predict(m, future)
A <- plot(m, forecast_mort)
A <- A + geom_vline(xintercept = forecast_mort$ds[forecast_mort$yhat < 0][1], linetype = "dashed", color = "red")
A <- A + labs(title = "Forecasting Lung Cancer Mortality in Japan", x = "Year", y = "Lung Cancer Mortality")
A <- A + annotate(geom = "text", x = forecast_mort$ds[forecast_mort$yhat < 0][1], y = 0, label = as.character(year(forecast_mort$ds[forecast_mort$yhat < 0][1])))
ggplotly(A)
## Warning: Aspect ratios aren't yet implemented, but you can manually set a
## suitable height/width
## Warning: Aspect ratios aren't yet implemented, but you can manually set a
## suitable height/width
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.2.3
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
##
## stamp
# combine the two plots horizontally
plot_combined <- plot_grid(p, A, ncol = 2)
## Warning: Removed 1 rows containing missing values (`geom_vline()`).
## Warning: Removed 1 rows containing missing values (`geom_text()`).
# add title and axis labels
plot_combined <- plot_combined + ggtitle("Smoking Prevalence and Lung Cancer Mortality in Japan") +
xlab("Year") + ylab("Rate")
# display the combined plot
plot_combined